Downloading Python Modules and Libraries.

# Load reticulate so the Python chunks below can run inside this document.
library(reticulate)
# Point reticulate at the system Python 3 interpreter.  Use the reserved
# word FALSE rather than the alias F: T and F are ordinary variables that
# can be reassigned, so relying on them is fragile.
use_python("/usr/bin/python3", required = FALSE)
# One-time installs of the Python packages used below (uncomment to run).
#py_install("pandas")
#py_install("seaborn")
#py_install("matplotlib")
#py_install("numpy")
#py_install("pandas_profiling")
#py_install("scikit-learn")
#py_install('plotly.express', pip = T)

Importing the Python modules and the dataset used throughout the analysis.

# Third-party libraries used throughout the analysis.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the heart-disease dataset; expects heart.csv in the working directory.
heart = pd.read_csv('heart.csv')
# Recode the 0/1 HeartDisease target as boolean for clearer plots and models.
heart['HeartDisease'] = heart['HeartDisease'].astype('bool')

Basic Visualization and Statistics

# Histogram of patient ages in 30 bins.  plt.hist returns
# (counts, bin_edges, patches); the knitr echo of that tuple is preserved
# in the ## lines below.
plt.hist('Age', data = heart, color = 'black',bins = 30)
## (array([ 4.,  3.,  5.,  9., 17., 11., 31., 37., 18., 43., 18., 43., 52.,
##        25., 71., 84., 41., 76., 77., 32., 66., 30., 43., 28., 10., 20.,
##         9.,  1., 10.,  4.]), array([28.        , 29.63333333, 31.26666667, 32.9       , 34.53333333,
##        36.16666667, 37.8       , 39.43333333, 41.06666667, 42.7       ,
##        44.33333333, 45.96666667, 47.6       , 49.23333333, 50.86666667,
##        52.5       , 54.13333333, 55.76666667, 57.4       , 59.03333333,
##        60.66666667, 62.3       , 63.93333333, 65.56666667, 67.2       ,
##        68.83333333, 70.46666667, 72.1       , 73.73333333, 75.36666667,
##        77.        ]), <a list of 30 Patch objects>)
plt.title('Distribution of Ages in Sample')
plt.ylabel('Frequency')
plt.xlabel('Ages (in Years)')
plt.show()

# Scatter of cholesterol against maximum heart rate; points are coloured
# by MaxHR on the yellow-green-blue colormap.
chol = heart['Cholesterol']
max_hr = heart['MaxHR']
plt.scatter(chol, max_hr, c=max_hr, cmap='YlGnBu')
plt.xlabel('Cholesterol')
plt.ylabel('MaxHR')
plt.title("Cholesterol vs Max Heart Rate")
plt.show()

# Box plots of cholesterol within each chest-pain category; hue mirrors
# the x grouping so each category is drawn in its own colour.
plot = sns.boxplot(data=heart, hue='ChestPainType', x='ChestPainType', y='Cholesterol')
plt.title('Cholesterol Levels Among Chest Pain Types')
plt.xlabel('Chest Pain Types')
plt.ylabel('Cholesterol')
plt.show()

Supervised Learning Methods (Logistic & Linear Regression)

# scikit-learn utility for splitting the data.
import sklearn.model_selection
from sklearn.model_selection import train_test_split

# Predictor matrix and the boolean HeartDisease target.
x = heart.loc[:,['Cholesterol','MaxHR','Age']]
y = heart.loc[:,['HeartDisease']]

# Fix random_state so the split -- and every accuracy reported below --
# is reproducible between runs (the original drew a fresh random split
# each time, so the printed scores could not be reproduced).
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

Multivariate Linear Regression

# Multivariate linear regression of the boolean HeartDisease target on
# Cholesterol, MaxHR and Age.
print("Linear Regression")
## Linear Regression
import sklearn.linear_model
from sklearn.linear_model import LinearRegression

model = LinearRegression(fit_intercept=True)
model.fit(x_train, y_train)
## LinearRegression()
print("Linear Regression Model Equation")
## Linear Regression Model Equation
# One coefficient per predictor, in column order: Cholesterol, MaxHR, Age.
print(model.coef_)
## [[-0.0006631  -0.00617875  0.00860498]]
print(model.intercept_)
## [1.08381479]
print("Linear Regression Model Accuracy")
## Linear Regression Model Accuracy
# NOTE(review): for LinearRegression, .score() is R-squared, not
# classification accuracy -- the low value reflects how little of a 0/1
# target a linear fit explains, despite the "Accuracy" label printed above.
print(model.score(x_test, y_test))
## 0.13585005419790985
# Continuous predictions on the held-out set (not thresholded to classes).
prediction = model.predict(x_test)

Logistic Regression

# Logistic regression on the same three predictors.
print("Logistic Regression")
## Logistic Regression
import sklearn.linear_model
from sklearn.linear_model import LogisticRegression

# Convert the feature DataFrames to plain arrays.
x_test = x_test.to_numpy()
x_train = x_train.to_numpy()

# Flatten the single-column target DataFrames to 1-D arrays, the shape
# LogisticRegression.fit expects for y.
y_train = y_train.values.flatten()
y_test = y_test.values.flatten()

model = LogisticRegression()
model.fit(x_train, y_train)
## LogisticRegression()
print("Logistic Regression Model Equation")
## Logistic Regression Model Equation
# Log-odds coefficients in column order: Cholesterol, MaxHR, Age.
print(model.coef_)
## [[-0.00369229 -0.03181104  0.0419426 ]]
print(model.intercept_)
# model.predict(x_test)
## [3.2055482]
print("Logistic Regression Model Accuracy")
## Logistic Regression Model Accuracy
# For classifiers, .score() is mean accuracy on the held-out set.
print(model.score(x_test, y_test))
## 0.6565217391304348

Visualizations for single variable linear and logistic regression.

# Linear fit of Cholesterol on Age, with a little horizontal jitter so
# same-age points do not overplot.  (Trailing ; suppresses the knitr echo
# of the returned grid object.)
sns.lmplot(
    x="Age",
    y="Cholesterol",
    data=heart,
    x_jitter=.05,
    scatter_kws={"color": "lightgreen"},
    line_kws={"color": "blue"},
);

plt.show()

# Logistic curve of heart-disease probability against cholesterol,
# without a confidence band.
sns.regplot(
    x=heart['Cholesterol'],
    y=heart['HeartDisease'],
    logistic=True,
    ci=False,
    scatter_kws={"color": "lightgreen"},
    line_kws={"color": "blue"},
);

plt.show()

Decision Trees

# Decision-tree classifier plus a fresh train/test split over the same
# three predictors.
import sklearn.tree
from sklearn.tree import DecisionTreeClassifier

import sklearn.model_selection
from sklearn.model_selection import train_test_split

x = heart.loc[:,['Cholesterol','MaxHR','Age']]
y = heart.loc[:,['HeartDisease']]

# NOTE(review): no random_state here, so this split (and the tree scores
# printed below) changes on every run.
x_train, x_test, y_train, y_test = train_test_split(x,y)

# Sweep max_depth over 1, 6, 11, ..., 26 and record the test accuracy at
# each depth.  Dedicated names fix the original's bug of clobbering the
# feature matrix `x` and target `y` with the sweep's plotting range/list.
depths = range(1, 30, 5)
accuracies = []
print("Loop to find Optimal Number of Splits")
## Loop to find Optimal Number of Splits
for depth in depths:
    # random_state fixed so each tree is deterministic given the split.
    tree_model = DecisionTreeClassifier(max_depth=depth, random_state=2)
    tree_model.fit(x_train, y_train)
    accuracies.append(tree_model.score(x_test, y_test))
# Accuracy as a function of tree depth.
plt.plot(depths, accuracies, color='gray')
plt.title('Optimal Number of Splits')
plt.xlabel('Number of Splits')
plt.ylabel('Accuracy')
plt.show()

print("We can see from the graph that the optimal number of splits is around 6. However, this makes it that the graph is hard to read. Therefore the tree below has two splits.")
## We can see from the graph that the optimal number of splits is around 6. However, this makes it that the graph is hard to read. Therefore the tree below has two splits.
from sklearn import tree

# NOTE(review): `features`/`target` are never used below, and after the
# depth-sweep loop above `x`/`y` may no longer hold the model matrices --
# confirm before relying on these aliases.
features = x
target = y

# Final depth-2 tree, kept shallow so the rendered diagram stays readable.
Tree = DecisionTreeClassifier(max_depth = 2, random_state = 2)
Tree.fit(x_train, y_train)
## DecisionTreeClassifier(max_depth=2, random_state=2)
# Held-out accuracy of the depth-2 tree.
print(Tree.score(x_test, y_test))
## 0.6782608695652174
plt.figure(figsize = (50,50))
# FIX: label split nodes with the actual predictor names (the original
# rendered them as x[0]/x[1]/x[2]).
tree.plot_tree(Tree, filled = True, feature_names = ['Cholesterol', 'MaxHR', 'Age'])
plt.show()

PCA (Principal Component Analysis)

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Restrict to the PCA columns; select_dtypes(include='number') drops the
# boolean HeartDisease target, leaving Age, Cholesterol and RestingBP.
heart = heart[['Age','Cholesterol','RestingBP', 'HeartDisease']]
dataset = heart.select_dtypes(include = 'number')
Standardize = StandardScaler()

# BUG FIX: the original computed fit_transform but discarded the result
# and ran PCA on the raw columns, so PC1 (96.6% "explained variance")
# merely tracked whichever column had the largest raw variance.
# Standardize first, then decompose the scaled data.
scaled = Standardize.fit_transform(dataset)
PCA_Heart = PCA(n_components = 3)
PCs = PCA_Heart.fit_transform(scaled)
# Share of total variance captured by each principal component.
PCA_Heart.explained_variance_ratio_
import plotly.express as px

# BUG FIX: the original bound this figure to the name `PCA`, shadowing the
# sklearn PCA class imported above, and passed the first coordinate array
# into px.scatter's data_frame positional slot.  Bind a distinct name and
# pass the coordinates as explicit x/y keywords.
pca_fig = px.scatter(x = PCs[:,0], y = PCs[:,1], color = heart['HeartDisease'])

pca_fig.show()
# Total variance captured by the three components, as a percentage.
total_var = PCA_Heart.explained_variance_ratio_.sum() * 100

# 3-D view of the same decomposition; columns 0/1/2 of PCs are PC1-PC3.
fig = px.scatter_3d(
    PCs, x = 0, y = 1, z = 2, color = heart['HeartDisease'],
    title=f'Total Explained Variance: {total_var:.2f}%',
    labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'}
)
fig.show()

K-Means Clustering